/*******************************************************************************																	

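	DESCRIPTION: 	This do-file computes, for each year 1992-2016 and each
					unemployment duration, the pairwise correlations and
					covariances between the predictions of the different
					algorithms (random forest, boosting, LASSO), plots them
					over time, and draws a correlation heatmap of the
					predictions for 2006.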

*******************************************************************************/

* Start from a clean session:
clear all

* Numeric prefix used in the names of all files exported by this do-file:
global id_code 141

********************** 1. TIME SERIES ******************************************

* Results frame: year, model and outcome identifiers, followed by the three
* pairwise correlations and the three pairwise covariances.
frame create pred_corr ///
	year ///
	str30(model outcome) ///
	double(corr_rf_boost corr_rf_lasso corr_boost_lasso) ///
	double(cov_rf_boost cov_rf_lasso cov_boost_lasso)

* For every year and every unemployment duration, compute the pairwise
* correlation and covariance between the rf, boost and lasso predictions
* and append them as one row to the pred_corr frame.

* Algorithm pairs, listed in the column order of pred_corr
* (rf-boost, rf-lasso, boost-lasso):
local first_alg  rf rf boost
local second_alg boost lasso lasso

forvalues year = 1992/2016 {

	* Load the predictions of that year:
	use "$data/003_MainWithEnsemblePred_Full_`year'.dta", clear

	foreach outcome in p_emplAft6M_0M_In p_emplAft6M_6M_In p_emplAft6M_12M_In {

		forvalues k = 1/3 {
			local a : word `k' of `first_alg'
			local b : word `k' of `second_alg'

			* Correlation coefficient of pair k:
			quietly correlate `outcome'_`a' `outcome'_`b'
			local rho`k' = r(rho)

			* Covariance of pair k (r(cov_12) requires the covariance option):
			quietly correlate `outcome'_`a' `outcome'_`b', covariance
			local cov`k' = r(cov_12)
		}

		* Append one row: year, model, outcome, 3 correlations, 3 covariances.
		frame post pred_corr (`year') ("Model") ("`outcome'") ///
			(`rho1') (`rho2') (`rho3') (`cov1') (`cov2') (`cov3')
	}
}

* Write the collected results to disk straight from the pred_corr frame:
frame pred_corr {
	save "${output}/${id_code}_ML_Models_Correlation.dta", replace
}

* Load the saved results into the current frame:
use "${output}/${id_code}_ML_Models_Correlation.dta", clear

* Add labels. twoway uses variable labels as default legend keys, so the
* covariance series receive the same pair labels as the correlation series;
* otherwise the covariance plot's legend would show raw variable names.
label variable corr_rf_boost "R. Forest - B. Gradient"
label variable corr_rf_lasso "R. Forest - LASSO"
label variable corr_boost_lasso "B. Gradient - LASSO"
label variable cov_rf_boost "R. Forest - B. Gradient"
label variable cov_rf_lasso "R. Forest - LASSO"
label variable cov_boost_lasso "B. Gradient - LASSO"
label variable year "Year"

* Numeric, value-labelled version of the outcome name (1/2/3 follow the
* unemployment duration: 0, 6 and 12 months into the spell):
cap drop outcome_nice
gen outcome_nice =  1 if outcome == "p_emplAft6M_0M_In"
replace outcome_nice = 2 if outcome == "p_emplAft6M_6M_In"
replace outcome_nice = 3 if outcome == "p_emplAft6M_12M_In"

* Drop any pre-existing value label of the same name before (re)defining it:
cap label drop outcome_nice
label define outcome_nice 1 "At Start of Spell" 2 "6M into Spell" ///
	3 "12M into Spell" 
label values outcome_nice outcome_nice


* Time series of the three pairwise correlations, one panel per value of
* outcome_nice, with a dashed reference line at 2006.
local white  graphregion(color(white))
local by_opt `white' note("") holes(2) legend(at(2) pos(0))

twoway ///
	(connected corr_rf_boost    year) ///
	(connected corr_rf_lasso    year) ///
	(connected corr_boost_lasso year) ///
	, ///
	by(outcome_nice, `by_opt') ///
	xline(2006, lcolor(gray) lpattern(dash)) ///
	xlabel(1992(4)2016, labsize(small)) ///
	ylabel(0(0.2)1, angle(0) format(%5.2f)) ///
	ytitle("Correlation Coefficient") ///
	legend(cols(1) size(small) symxsize(*0.5)) ///
	`white' name(scatt1, replace)

* Export the graph just drawn:
graph export "${output}/${id_code}_ML_Models_Correlation_Scatterplot.pdf", replace

* Time series of the three pairwise covariances, one panel per value of
* outcome_nice, with a dashed reference line at 2006.
* NOTE(review): y-axis labels use two decimals; confirm this is fine enough
* for the magnitude of the covariances.
local white  graphregion(color(white))
local by_opt `white' note("") holes(2) legend(at(2) pos(0))

twoway ///
	(connected cov_rf_boost    year) ///
	(connected cov_rf_lasso    year) ///
	(connected cov_boost_lasso year) ///
	, ///
	by(outcome_nice, `by_opt') ///
	xline(2006, lcolor(gray) lpattern(dash)) ///
	xlabel(1992(4)2016, labsize(small)) ///
	ylabel(, angle(0) format(%5.2f)) ///
	ytitle("Covariance") ///
	legend(cols(1) size(small) symxsize(*0.5)) ///
	`white' name(scatt2, replace)

* Export the graph just drawn:
graph export "${output}/${id_code}_ML_Models_Covariance_Scatterplot.pdf", replace


********************** 2. HEATMAP FOR 2006 *************************************

* Load the 2006 predictions and keep only observations that have a
* baseline prediction:
use "${data}/003_MainWithEnsemblePred_Full_2006.dta", clear
drop if missing(p_emplAft6M_0M_In)

* Give the baseline prediction an explicit suffix:
rename p_emplAft6M_0M_In p_emplAft6M_0M_In_Base

* Bring in the linear-model prediction. The assert() requires that no
* observation exists only in the master data; only matched observations
* are kept and no _merge variable is created.
merge 1:1 LopNr_PersonNr InLnr ///
	using "${data}/116_Linear_Predictions_Full_2006.dta", ///
	keepusing(p_emplAft6M_0M_In_Linear) ///
	assert(using match) keep(match) nogenerate
	
* Short display names for the five prediction variables:
local pred p_emplAft6M_0M_In
label variable `pred'_Base   "Ensemble"
label variable `pred'_Linear "Linear"
label variable `pred'_lasso  "LASSO"
label variable `pred'_rf     "R. Forest"
label variable `pred'_boost  "B. Gradient"

* Shared white-background option for the plot, the ramp and their combination:
local white graphregion(color(white))

* Correlation matrix of all variables matching p_emplAft6M_0M_In*; the
* result is left in r(C) and plotted directly by heatplot below.
correlate p_emplAft6M_0M_In*

heatplot r(C), ///
	label values(format(%9.2f) size(vsmall)) ///
	cuts(-1(.1)1) colors(ebblue white orange_red, ipolate(20)) ///
	ramp(right subtitle("Correlation" "Coefficient") format(%9.2f) ///
		labels(-1 0 1) `white' combine(`white')) ///
	xlabel(, labsize(vsmall) angle(0)) ///
	ylabel(, nogrid labsize(vsmall) angle(0)) ///
	title("Baseline Model, 2006, At Start of Spell") ///
	`white' name(heat, replace)

* Export the heatmap just drawn:
graph export "${output}/${id_code}_ML_Models_Correlation_Heatmap_2006_emplAft6M_0M_In.pdf", replace
